Show the code
# Load required libraries
library(tidyverse)
library(ggplot2)
library(knitr)
library(kableExtra)
library(data.table)
library(gridExtra) # For arranging multiple plots

Estimating False Discovery Rates in Strategy Development
# Load required libraries
library(tidyverse)
library(ggplot2)
library(knitr)
library(kableExtra)
library(data.table)
library(gridExtra) # For arranging multiple plots

In this tutorial, we will explore the problem of false discoveries in quantitative finance, specifically focusing on how multiple testing leads to selection bias in strategy development. We will:
This exercise will provide hands-on experience with the concepts discussed in the lecture on backtest overfitting.
Financial strategy development differs fundamentally from many scientific disciplines:
When a researcher tests multiple strategies and selects only the best performer, they risk identifying patterns that exist purely by chance. Understanding false discovery rates is essential for distinguishing genuine market inefficiencies from statistical artifacts.
Let’s first create a function to generate random strategy returns. These will simulate strategies with no genuine edge (i.e., the true Sharpe ratio is zero), as well as some strategies with a small genuine edge.
# Function to generate random strategy returns
# Simulates daily returns for n_strategies, a fraction of which receive a
# small constant drift ("edge"); the remainder are pure noise with a true
# Sharpe ratio of zero.
#
# Args:
#   n_strategies: number of strategies (columns) to simulate
#   n_returns:    number of daily returns (rows) per strategy
#   mean_return:  mean of the noise component of daily returns
#   sd_return:    standard deviation of daily returns
#   edge_pct:     fraction of strategies given a genuine edge, in [0, 1]
#   edge_size:    daily drift added to edge strategies (~12% annual at 0.0005)
#
# Returns: list with
#   returns:  n_returns x n_returns matrix of daily returns, columns named
#             "Strategy_1" ... "Strategy_<n_strategies>"
#   has_edge: logical vector, TRUE for the columns that received the drift
generate_random_strategies <- function(n_strategies = 1000,
                                       n_returns = 252,
                                       mean_return = 0,
                                       sd_return = 0.01,
                                       edge_pct = 0.05, # Percentage of strategies with edge (typical in finance)
                                       edge_size = 0.0005) # Size of daily edge (~12% annual)
{
  # Input validation
  if (!is.numeric(n_strategies) || n_strategies <= 0)
    stop("n_strategies must be a positive number")
  if (!is.numeric(n_returns) || n_returns <= 0)
    stop("n_returns must be a positive number")
  if (!is.numeric(edge_pct) || edge_pct < 0 || edge_pct > 1)
    stop("edge_pct must be between 0 and 1")
  # Matrix of IID normally distributed returns: one column per strategy
  returns_matrix <- matrix(
    rnorm(n_strategies * n_returns, mean = mean_return, sd = sd_return),
    nrow = n_returns,
    ncol = n_strategies
  )
  # Add a constant drift to the first n_edge_strategies columns.
  # Vectorized column assignment replaces the original per-column loop.
  n_edge_strategies <- round(n_strategies * edge_pct)
  if (n_edge_strategies > 0) {
    edge_cols <- seq_len(n_edge_strategies)
    returns_matrix[, edge_cols] <- returns_matrix[, edge_cols] + edge_size
  }
  # Name each strategy for easier reference
  colnames(returns_matrix) <- paste0("Strategy_", seq_len(n_strategies))
  # seq_len() (not 1:n) so that n_edge_strategies == 0 flags no strategy.
  # The original `has_edge[1:n_edge_strategies] <- TRUE` expanded to
  # `has_edge[1:0]` when edge_pct = 0 and wrongly marked strategy 1 as
  # having a true edge.
  has_edge <- logical(n_strategies)
  has_edge[seq_len(n_edge_strategies)] <- TRUE
  return(list(
    returns = returns_matrix,
    has_edge = has_edge
  ))
}
# Generate strategies with some having genuine edge
# (seed fixed so the tutorial's reported numbers are reproducible)
set.seed(42)
strategy_data <- generate_random_strategies(
n_strategies = 1000,
n_returns = 252,
edge_pct = 0.05, # 5% of strategies have genuine edge
edge_size = 0.0005 # Small daily drift (about 12% annual return)
)
# Extract the returns matrix and edge information
random_strategies <- strategy_data$returns
true_edge <- strategy_data$has_edge
# Check the dimensions to confirm we have 252 days × 1000 strategies
# NOTE(review): the "[1] 252 1000" on the next line is console output fused
# into the source by the rendering pipeline; the executable statement is
# just dim(random_strategies)
dim(random_strategies)[1] 252 1000
Instructor Note: We use simplified assumptions (normal, IID returns) for clarity and teaching effectiveness. Real financial markets exhibit additional complexity (e.g., skewness, kurtosis, volatility clustering), which you might explore in more advanced exercises or projects.
library(sn) # Skew-normal package
# Skew-normal variant of generate_random_strategies: identical interface plus
# a `skewness` parameter (the slant `alpha` of the skew-normal distribution);
# skewness = 0 reduces to the symmetric case. Requires the `sn` package
# (loaded above) for rsn().
generate_random_strategies <- function(n_strategies = 1000,
                                       n_returns = 252,
                                       mean_return = 0,
                                       sd_return = 0.01,
                                       skewness = 0, # set to non-zero for skewness
                                       edge_pct = 0.05,
                                       edge_size = 0.0005) {
  # Skew-normal draws: xi = location, omega = scale, alpha = slant
  returns_matrix <- matrix(
    rsn(n_strategies * n_returns, xi = mean_return, omega = sd_return, alpha = skewness),
    nrow = n_returns,
    ncol = n_strategies
  )
  # Add edge to the first n_edge_strategies columns
  n_edge_strategies <- round(n_strategies * edge_pct)
  if (n_edge_strategies > 0) {
    edge_cols <- seq_len(n_edge_strategies)
    returns_matrix[, edge_cols] <- returns_matrix[, edge_cols] + edge_size
  }
  colnames(returns_matrix) <- paste0("Strategy_", seq_len(n_strategies))
  # seq_len() avoids the 1:0 bug that would mark strategy 1 as having edge
  # when edge_pct = 0
  has_edge <- logical(n_strategies)
  has_edge[seq_len(n_edge_strategies)] <- TRUE
  return(list(returns = returns_matrix, has_edge = has_edge))
}
library(rugarch)
# GARCH(1,1) variant: simulates strategies whose volatility clusters over
# time. Requires the `rugarch` package (loaded above). Each simulated path
# is rescaled to a 1% per-period volatility so strategies are comparable.
#
# Args mirror generate_random_strategies (no mean/sd arguments: the GARCH
# spec and the rescaling fix those).
# Returns: list(returns = n_returns x n_strategies matrix, has_edge = logical)
generate_garch_strategies <- function(n_strategies = 1000, n_returns = 252,
                                      edge_pct = 0.05, edge_size = 0.0005) {
  spec <- ugarchspec(mean.model = list(armaOrder = c(0,0)),
                     variance.model = list(model = "sGARCH", garchOrder = c(1,1)),
                     distribution.model = "norm")
  # vapply (not sapply) guarantees an n_returns x n_strategies matrix even
  # in degenerate cases
  returns_matrix <- vapply(seq_len(n_strategies), function(x) {
    sim <- ugarchpath(spec, n.sim = n_returns)
    fitted_returns <- as.numeric(fitted(sim))
    fitted_returns / sd(fitted_returns) * 0.01 # standardized volatility
  }, numeric(n_returns))
  # Add edge to the first n_edge_strategies columns
  n_edge_strategies <- round(n_strategies * edge_pct)
  if (n_edge_strategies > 0) {
    edge_cols <- seq_len(n_edge_strategies)
    returns_matrix[, edge_cols] <- returns_matrix[, edge_cols] + edge_size
  }
  colnames(returns_matrix) <- paste0("Strategy_", seq_len(n_strategies))
  # seq_len() avoids the 1:0 bug that would mark strategy 1 as having edge
  # when edge_pct = 0
  has_edge <- logical(n_strategies)
  has_edge[seq_len(n_edge_strategies)] <- TRUE
  return(list(returns = returns_matrix, has_edge = has_edge))
}
# Now, let's calculate the Sharpe ratio for each of these strategies.
# Non-annualized (per-period) Sharpe ratio: mean(returns) / sd(returns).
# Returns 0 for a constant series (zero standard deviation) instead of NaN.
# This per-period form is what statistical tests such as the Deflated
# Sharpe Ratio operate on.
calculate_sharpe <- function(returns) {
  vol <- sd(returns)
  if (vol == 0) {
    return(0)
  }
  mean(returns) / vol
}
# Annualized Sharpe ratio: the per-period Sharpe scaled by the square root
# of the number of periods per year (252 trading days by default).
# This is the figure typically reported in industry.
calculate_sharpe_annualized <- function(returns, annualization_factor = 252) {
  sqrt(annualization_factor) * calculate_sharpe(returns)
}
# Compute non-annualized Sharpe ratios for DSR and statistical testing
sharpe_ratios <- apply(random_strategies, 2, calculate_sharpe)
# Compute annualized Sharpe ratios for reporting and visualization
sharpe_ratios_annualized <- sharpe_ratios * sqrt(252)
# Examine distribution statistics of annualized Sharpe ratios
# NOTE(review): the "Min. 1st Qu. ..." text and the numeric line below are
# console output fused into the source during rendering; the executable
# statement is just summary(sharpe_ratios_annualized)
summary(sharpe_ratios_annualized) Min. 1st Qu. Median Mean 3rd Qu. Max.
-3.12341 -0.63281 -0.00652 0.02047 0.69419 3.22375
# Separate Sharpe ratios (annualized) for strategies with and without edge
edge_sharpes <- sharpe_ratios_annualized[true_edge]
no_edge_sharpes <- sharpe_ratios_annualized[!true_edge]
# Create data frame clearly labeling annualized Sharpe ratios for plotting
sharpe_df <- data.frame(
Annualized_Sharpe = sharpe_ratios_annualized,
Has_Edge = factor(true_edge, levels = c(FALSE, TRUE),
labels = c("No Edge", "True Edge"))
)
# Visualize clearly labeled Annualized Sharpe ratios
# (position = "identity" overlays the two histograms rather than stacking)
ggplot(sharpe_df, aes(x = Annualized_Sharpe, fill = Has_Edge)) +
geom_histogram(bins = 30, alpha = 0.7, position = "identity") +
labs(
title = "Distribution of Annualized Sharpe Ratios for Random Strategies",
subtitle = paste("Mean =", round(mean(sharpe_ratios_annualized), 2),
"SD =", round(sd(sharpe_ratios_annualized), 2),
"| 5% of strategies have true edge"),
x = "Annualized Sharpe Ratio",
y = "Count",
fill = "Strategy Type"
) +
theme_minimal() +
geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
scale_fill_manual(values = c("No Edge" = "steelblue", "True Edge" = "darkred"))The histogram above shows the distribution of Sharpe ratios for our 1,000 randomly generated strategies. This is a critical visualization that illustrates several important concepts in quantitative finance:
Normal Distribution: Notice how the Sharpe ratios for strategies without edge follow an approximately normal distribution, centered near zero. This is exactly what we would expect when strategies have no genuine edge – their performance is purely random.
Standard Deviation: The standard deviation of Sharpe ratios tells us about the spread of performance metrics. This is crucial because it determines how impressive a “good” Sharpe ratio needs to be to stand out from random noise.
Range of Values: Even though most strategies are completely random (with no edge), we see Sharpe ratios ranging from approximately -3 to +3. In practical terms, a Sharpe ratio of +2 is typically considered excellent in the investment industry! Yet here we see several random strategies achieving this level purely by chance.
Strategies with Edge: The strategies with genuine edge (in dark red) tend to have higher Sharpe ratios on average, but there’s substantial overlap with the no-edge strategies. This illustrates why it’s so difficult to identify genuine strategies from random ones based on Sharpe ratio alone.
This distribution demonstrates why statistical significance is so important in strategy evaluation. If you tested just one strategy and it showed a Sharpe ratio of 1.5, you might be excited about its performance. However, this chart shows that among 1,000 random strategies, we’d expect several to show Sharpe ratios of 1.5 or higher simply due to chance.
This is precisely why multiple testing is so dangerous in quantitative finance. When researchers or traders test many strategy variations and report only the best performer, they are essentially “selecting” from the right tail of this distribution – capturing lucky outcomes rather than genuine edge.
In the next sections, we’ll explore exactly how the maximum Sharpe ratio increases with the number of trials, and how we can use the Deflated Sharpe Ratio to correct for this selection bias.
Take a moment to answer these questions:
Now, let’s simulate the process of strategy selection, where a researcher tests multiple strategies and selects the one with the highest Sharpe ratio.
# Find the maximum Sharpe ratio from our 1000 strategies
# This simulates what happens when a researcher selects only the best-performing strategy
max_sharpe <- max(sharpe_ratios_annualized)
max_sharpe_index <- which.max(sharpe_ratios_annualized)
# Check if the best-performing strategy has true edge
has_true_edge <- true_edge[max_sharpe_index]
# Print the maximum Sharpe ratio and which strategy achieved it
# This is often what would be presented in a backtest report or research paper
# NOTE(review): the text trailing the cat() call and the two lines after it
# ("Strategy Index: 851", "Has True Edge: FALSE") are console output fused
# into the source during rendering, not code
cat("Maximum Sharpe Ratio:", round(max_sharpe, 2),
"\nStrategy Index:", max_sharpe_index,
"\nHas True Edge:", has_true_edge, "\n")Maximum Sharpe Ratio: 3.22
Strategy Index: 851
Has True Edge: FALSE
# Expected maximum Sharpe ratio under the False Strategy Theorem:
# the expected best Sharpe among n_trials independent trials whose true
# Sharpe ratios are drawn from N(mean_sr, std_sr^2). This grows with
# n_trials even when every strategy is pure noise, which is exactly the
# selection-bias effect we want to quantify.
expected_max_sharpe <- function(n_trials, mean_sr = 0, std_sr = 1) {
  # Euler-Mascheroni constant (appears in the extreme-value approximation)
  emc <- 0.57721566490153286060651209008240243104215933593992
  # Asymptotic expectation of the maximum of n_trials standard normals
  z_max <- (1 - emc) * qnorm(p = 1 - 1 / n_trials) +
    emc * qnorm(1 - 1 / (n_trials * exp(1)))
  # Shift and scale by the cross-sectional Sharpe-ratio distribution
  mean_sr + std_sr * z_max
}
# Calculate the expected maximum Sharpe ratio for 1000 trials
# We use the actual standard deviation of our Sharpe ratios for accuracy
exp_max_sharpe <- expected_max_sharpe(1000, mean_sr = 0, std_sr = sd(sharpe_ratios_annualized))
# Compare the theoretical expectation with our actual observed maximum
# They should be relatively close if our strategies are truly random
# NOTE(review): the text trailing each cat() call below is console output
# fused into the source during rendering
cat("Expected Maximum Sharpe Ratio:", round(exp_max_sharpe, 2), "\n")Expected Maximum Sharpe Ratio: 3.23
cat("Actual Maximum Sharpe Ratio:", round(max_sharpe, 2), "\n")Actual Maximum Sharpe Ratio: 3.22
Let’s visualise how the maximum Sharpe ratio increases with the number of trials.
# Calculate expected maximum Sharpe ratio for different numbers of trials
# This helps us visualize how the "best strategy" improves just by testing more variations
trial_counts <- c(1, 5, 10, 50, 100, 500, 1000)
# NOTE(review): expected_max_sharpe(1) evaluates qnorm(0) = -Inf, so the
# first entry of exp_max_sharpes is -Inf; the log-scale plot later silently
# drops that point — confirm whether trial count 1 is intended here
exp_max_sharpes <- sapply(trial_counts, expected_max_sharpe,
mean_sr = 0, std_sr = sd(sharpe_ratios_annualized))
# Empirical counterpart of the False Strategy Theorem: repeatedly draw
# n_trials Sharpe ratios from N(mean_sr, std_sr^2), record the best of each
# draw, and summarise the distribution of those maxima. Each simulation run
# models one "research campaign" in which only the best backtest is kept.
simulate_max_sharpe <- function(n_trials, n_simulations = 100,
                                mean_sr = 0, std_sr = 1) {
  # One maximum per simulated research campaign; vapply preallocates and
  # guarantees a numeric vector of length n_simulations
  best_per_run <- vapply(
    seq_len(n_simulations),
    function(run) max(rnorm(n_trials, mean = mean_sr, sd = std_sr)),
    numeric(1)
  )
  # Summary statistics of the distribution of maxima
  list(
    mean = mean(best_per_run), # average maximum across simulations
    sd = sd(best_per_run),     # spread of the maxima
    values = best_per_run      # all individual maxima
  )
}
# Run the simulation for each trial count
# For each number of trials, we simulate 100 sets to get a good estimate
set.seed(123) # For reproducibility
simulated_results <- lapply(trial_counts, simulate_max_sharpe,
n_simulations = 100,
mean_sr = 0,
std_sr = sd(sharpe_ratios_annualized))
# Extract the mean maximum Sharpe ratio from each simulation set
sim_max_sharpes <- sapply(simulated_results, function(x) x$mean)
# Create a data frame for plotting both theoretical and simulated results
# NOTE(review): for trial count 1 the theoretical value is -Inf (see note
# above) while the simulated mean is finite, so the two curves only agree
# from the second trial count onward
plot_data <- tibble(
Trials = rep(trial_counts, 2), # Each trial count appears twice (theoretical and simulated)
`Sharpe Ratio` = c(exp_max_sharpes, sim_max_sharpes), # Values from both methods
Type = rep(c("Theoretical", "Simulated"), each = length(trial_counts)) # Identify the source
)
# Create the plot comparing theoretical vs. simulated maximum Sharpe ratios
# This is a key visualization demonstrating how selection bias increases with trials
ggplot(plot_data, aes(x = Trials, y = `Sharpe Ratio`, color = Type)) +
geom_line(size = 1) +
geom_point(size = 3) +
scale_x_log10() + # Log scale makes the pattern clearer across different trial counts
labs(
title = "Maximum Sharpe Ratio vs. Number of Trials",
subtitle = "Theoretical (False Strategy Theorem) vs. Simulated",
x = "Number of Trials (log scale)",
y = "Maximum Sharpe Ratio"
) +
theme_minimal() +
scale_color_manual(values = c("Theoretical" = "darkblue", "Simulated" = "darkred"))The results above demonstrate one of the most important concepts in quantitative finance: the expected maximum Sharpe ratio increases systematically with the number of trials, even when all strategies have zero true edge.
Looking at our data:
This relationship is both theoretical (as predicted by the False Strategy Theorem) and empirical (as shown by our simulations). The close match between our theoretical and simulated lines confirms the validity of the theorem.
The implications for research are profound:
This is why proper statistical adjustments like the Deflated Sharpe Ratio (which we’ll explore next) are essential when evaluating investment strategies that resulted from backtesting multiple configurations.
Let’s visualise the selection bias problem more directly:
# Generate a set of Sharpe ratios for 100 random strategies
# (all draws from N(0, 1): no strategy has any true edge)
set.seed(456)
random_sharpes <- rnorm(100, mean = 0, sd = 1)
# Create a data frame for visualization
selection_bias_df <- data.frame(
Strategy = 1:100,
Sharpe = random_sharpes
)
# Plot the Sharpe ratios with the maximum highlighted
# (lollipop-style: a segment from zero to each Sharpe, the best one in red)
ggplot(selection_bias_df, aes(x = Strategy, y = Sharpe)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray") +
geom_segment(aes(xend = Strategy, yend = 0), color = "steelblue", alpha = 0.5) +
geom_point(color = "steelblue", size = 2) +
geom_point(data = selection_bias_df[which.max(random_sharpes),],
color = "red", size = 4) +
annotate("text", x = which.max(random_sharpes),
y = max(random_sharpes) + 0.3,
label = paste("Max SR =", round(max(random_sharpes), 2)),
color = "red") +
labs(
title = "Selection Bias in Strategy Development",
subtitle = "100 random strategies with no true edge",
x = "Strategy Number",
y = "Sharpe Ratio"
) +
theme_minimal()This visualisation shows exactly what happens in strategy selection: we run many trials and pick the best one. The strategy with the highest Sharpe ratio (highlighted in red) looks impressive, but it’s purely a result of random chance. If we were to implement this strategy, we would likely be disappointed by its future performance.
Now, let’s implement the Deflated Sharpe Ratio to correct for selection bias under multiple testing.
# Sample skewness and excess kurtosis of a return series, computed from
# z-scores of the observations. Excess kurtosis is reported relative to the
# normal distribution (0 for a normal), hence the "- 3".
calculate_moments <- function(returns) {
  standardized <- (returns - mean(returns)) / sd(returns)
  list(
    skewness = mean(standardized^3),
    kurtosis = mean(standardized^4) - 3
  )
}
# Expected maximum Sharpe ratio (False Strategy Theorem).
# NOTE: this re-defines the function introduced earlier in the tutorial with
# identical behaviour; it is repeated here so the DSR section is
# self-contained.
expected_max_sharpe <- function(n_trials, mean_sr = 0, std_sr = 1) {
  emc <- 0.577215664901532860606512090082402431042159336 # Euler-Mascheroni constant
  # Extreme-value approximation to E[max of n_trials standard normals]
  z_max <- (1 - emc) * qnorm(1 - 1 / n_trials) +
    emc * qnorm(1 - 1 / (n_trials * exp(1)))
  mean_sr + std_sr * z_max
}
# Deflated Sharpe Ratio (DSR), after Bailey & Lopez de Prado: the
# probability that the observed Sharpe ratio exceeds the expected maximum
# Sharpe ratio produced by n_trials random trials, adjusting the SR
# standard error for sample length, skewness and kurtosis.
#
# Args:
#   returns:       vector of per-period (daily) strategy returns
#   n_trials:      number of strategies tested before selecting this one
#   sr_mean:       mean of the cross-sectional SR distribution (annualised)
#   sr_std:        std dev of the cross-sectional SR distribution
#                  (annualised); defaults to 1 with a warning if missing
#   annual_factor: periods per year used for annualisation (252 by default)
#
# Returns: list(sharpe_ratio, expected_max_sr, dsr, skewness, kurtosis,
#   n_observations); dsr is NA when the variance correction is unstable.
calculate_dsr <- function(returns, n_trials, sr_mean = 0, sr_std = NULL, annual_factor = 252) {
n <- length(returns)
# Compute daily Sharpe ratio explicitly
sr_daily <- mean(returns) / sd(returns)
# Explicit annualisation
sr_annualised <- sr_daily * sqrt(annual_factor)
# If sr_std not provided, issue warning and default sensibly
if (is.null(sr_std) || sr_std <= 0) {
sr_std <- 1
warning("SR standard deviation not provided or invalid; using default value of 1 (annualised)")
}
# Higher moments of the daily returns, used in the SR variance correction
moments <- calculate_moments(returns)
skew <- moments$skewness
kurt <- moments$kurtosis
# Use annualised expected maximum Sharpe ratio consistently
exp_max_sr_annualised <- expected_max_sharpe(n_trials, mean_sr = sr_mean, std_sr = sr_std)
# NOTE(review): the numerator below compares annualised SRs while the
# variance correction uses the daily SR; the published PSR/DSR formula
# keeps all SR terms at the same frequency — confirm the intended scaling
# before relying on absolute DSR values.
numerator <- (sr_annualised - exp_max_sr_annualised) * sqrt(n - 1)
# Higher-moment adjustment to the SR standard error; (kurt + 2)/4 here is
# (gamma4 - 1)/4 expressed in terms of EXCESS kurtosis
denominator_term <- 1 - skew * sr_daily + ((kurt + 2) / 4) * sr_daily^2
# Guard against a non-positive or undefined variance estimate
if (denominator_term <= 0 || is.na(denominator_term)) {
warning("Instability in denominator calculation; returning NA for DSR")
return(list(
sharpe_ratio = sr_annualised,
expected_max_sr = exp_max_sr_annualised,
dsr = NA,
skewness = skew,
kurtosis = kurt,
n_observations = n
))
} else {
denominator <- sqrt(denominator_term)
}
# DSR = P(observed SR > expected max SR) under the adjusted distribution
dsr <- pnorm(numerator / denominator)
return(list(
sharpe_ratio = sr_annualised,
expected_max_sr = exp_max_sr_annualised,
dsr = dsr,
skewness = skew,
kurtosis = kurt,
n_observations = n
))
}# Visualize the DSR calculation process
library(DiagrammeR)
# Create flow diagram of DSR calculation process
# (the quoted block below is Graphviz DOT source passed to grViz(), not R;
# the "#" lines inside it are DOT comments)
grViz("
digraph DSR_calculation {
# Node definitions
node [shape = rectangle, style = filled, fillcolor = lightblue, fontname = Helvetica]
A [label = 'Input: Strategy returns\\nand number of trials']
B [label = 'Calculate Sharpe ratio\\n(SR = mean/std)']
C [label = 'Calculate higher moments\\n(skewness, kurtosis)']
D [label = 'Estimate expected maximum SR\\n(False Strategy Theorem)']
E [label = 'Calculate DSR\\n(probability measure)']
F [label = 'Interpret DSR result\\n(probability of true discovery)']
# Edge definitions
A -> B
B -> C
{B; C} -> D
{B; C; D} -> E
E -> F
# Graph attributes
graph [rankdir = TB, splines = true, nodesep = 0.8]
}
")# Calculate DSR for our maximum Sharpe ratio strategy
# This applies our DSR function to the strategy that looked best in the backtest
best_strategy_returns <- random_strategies[, max_sharpe_index]
dsr_result <- calculate_dsr(
best_strategy_returns,
n_trials = 1000, # We tested 1000 strategies
sr_std = sd(sharpe_ratios_annualized) # Use the actual variability in our Sharpe ratios
)
# Print the results
# These metrics tell us whether our "best strategy" is likely genuine or just lucky
# NOTE(review): the text trailing each cat() call below is console output
# fused into the source during rendering
cat("Sharpe Ratio:", round(dsr_result$sharpe_ratio, 2), "(annualized)\n")Sharpe Ratio: 3.22 (annualized)
cat("Expected Max Sharpe Ratio:", round(dsr_result$expected_max_sr, 2), "\n")Expected Max Sharpe Ratio: 3.23
cat("Deflated Sharpe Ratio:", round(dsr_result$dsr, 4), "\n")Deflated Sharpe Ratio: 0.4428
cat("Skewness:", round(dsr_result$skewness, 2), "\n")Skewness: -0.16
cat("Excess Kurtosis:", round(dsr_result$kurtosis, 2), "\n")Excess Kurtosis: -0.27
The Deflated Sharpe Ratio (DSR) we’ve calculated represents the probability that our strategy’s performance is not merely due to selection bias. Let’s interpret our results:
Sharpe Ratio: Our best strategy has an annualized Sharpe ratio of 3.22. In traditional finance, this would be considered excellent performance, well above the typical threshold of 1.0 for investment consideration.
Expected Maximum Sharpe Ratio: However, given that we tested 1000 strategies, we would expect to find a maximum Sharpe ratio of approximately 3.23 purely by chance. This is a critical benchmark that any truly successful strategy must exceed.
Deflated Sharpe Ratio: The DSR value of 0.44 means there’s approximately a 44% probability that this high Sharpe ratio is genuinely indicative of a strategy’s true predictive power, rather than being merely due to luck from multiple tests. In practice, this means the strategy’s validity is quite uncertain—neither clearly false nor clearly genuine.
Higher Moments (Skewness & Kurtosis): With skewness of -0.16 and excess kurtosis of -0.27, our returns distribution closely approximates normality, indicating relatively stable strategy returns without extreme outliers. This reduces, though does not eliminate, uncertainty from extreme events influencing our Sharpe ratio.
In professional quantitative investment firms, DSR thresholds often guide strategy implementation decisions:
| DSR Range | Interpretation | Typical Action |
|---|---|---|
| 0.00-0.20 | Strong evidence of false discovery | Reject strategy |
| 0.20-0.50 | Likely false discovery or uncertain | Consider additional testing or validation on new data before implementation |
| 0.50-0.80 | Uncertain (moderate evidence) | Additional testing with independent data required |
| 0.80-0.95 | Potentially valid (good evidence) | Implement with caution, possibly at reduced scale |
| 0.95-1.00 | Strong evidence of valid strategy | Full implementation consideration |
With a DSR of 0.44, your current strategy would sit firmly in the “Likely false or uncertain” category, demanding careful further validation.
In the next section, we’ll explore how different parameters affect the false discovery rate, which will further illustrate why traditional backtesting approaches are so problematic.
Let’s analyze how the false discovery rate changes with different parameters.
# Function to calculate precision, recall, and False Discovery Rate (FDR)
# under a Bayesian view of strategy testing (Lopez de Prado's framework
# from the lecture).
#
# Args:
#   ground_truth: prior probability that a tested strategy is truly
#                 profitable; must be in [0, 1). The value 1 is rejected
#                 because the odds ratio theta = p/(1-p) diverges and
#                 precision becomes NaN.
#   alpha: Type I error rate (significance level)
#   beta:  Type II error rate (1 - power)
#
# Returns: a one-row tibble with ground_truth, theta, alpha, beta, recall,
#   precision and fdr.
calculate_fdr <- function(ground_truth, alpha = 0.05, beta = 0.2) {
  # Input validation (ground_truth == 1 previously slipped through and
  # produced theta = Inf, precision = NaN)
  if (ground_truth < 0 || ground_truth >= 1)
    stop("ground_truth must be in [0, 1)")
  if (alpha < 0 || alpha > 1)
    stop("alpha must be between 0 and 1")
  if (beta < 0 || beta > 1)
    stop("beta must be between 0 and 1")
  # Odds ratio of true strategies to false strategies
  theta <- ground_truth / (1 - ground_truth)
  # Recall (power): probability of detecting a true strategy
  recall <- 1 - beta
  # Expected true positives per unit of false strategies
  b1 <- recall * theta
  # Precision: P(strategy is true | test is positive)
  precision <- b1 / (b1 + alpha)
  # FDR: P(strategy is false | test is positive)
  fdr <- 1 - precision
  # Return all relevant metrics in a tidy format
  return(tibble(
    ground_truth = ground_truth, # Prior probability of a true strategy
    theta = theta,               # Odds ratio of true vs. false strategies
    alpha = alpha,               # Type I error rate (significance level)
    beta = beta,                 # Type II error rate (1 - power)
    recall = recall,             # True positive rate (power)
    precision = precision,       # Proportion of positives that are true
    fdr = fdr                    # Proportion of positives that are false
  ))
}
# Calculate FDR for different ground truth probabilities
# This shows how the FDR changes based on the prior probability of true strategies
ground_truths <- seq(0.01, 0.5, by = 0.01) # Try values from 1% to 50%
fdr_results <- map_df(ground_truths, calculate_fdr) # Apply function to each value
# Plot the results to visualize the relationship
# This is a key insight: even with standard statistical testing, FDR remains high
# when true strategies are rare (which is the case in finance)
ggplot(fdr_results, aes(x = ground_truth)) +
geom_line(aes(y = precision, color = "Precision"), size = 1) +
geom_line(aes(y = fdr, color = "FDR"), size = 1) +
scale_color_manual(values = c("Precision" = "blue", "FDR" = "red")) +
labs(
title = "Precision and False Discovery Rate vs. Ground Truth Probability",
subtitle = "Alpha = 0.05, Beta = 0.2",
x = "Ground Truth Probability (Proportion of True Strategies)",
y = "Rate",
color = "Metric"
) +
theme_minimal() +
scale_x_continuous(labels = scales::percent) + # Format x-axis as percentages
scale_y_continuous(labels = scales::percent) + # Format y-axis as percentages
# Add a vertical line at the point where precision = 0.5 (FDR = 0.5)
# NOTE(review): precision = 0.5 when b1 = alpha, i.e. when the ODDS ratio
# theta = alpha/(1 - beta) = 0.0625; the corresponding ground-truth
# PROBABILITY is 0.0625/1.0625 ~= 5.9%, so the vline/label placed at
# x = 0.0625 on the probability axis is slightly off — confirm intent
geom_vline(xintercept = 0.0625, linetype = "dashed", color = "black") +
annotate("text", x = 0.09, y = 0.5,
label = "Precision = 50%\nwhen ground truth ≈ 6.25%",
hjust = 0)Now, let’s see how the FDR changes with different significance levels (alpha).
# Calculate FDR for different alpha values (significance levels)
alphas <- seq(0.01, 0.1, by = 0.01) # Test alpha values from 1% to 10%
# Create combinations of ground truth probabilities and alphas
# (rowwise + list-column so calculate_fdr's one-row tibble can be unnested)
fdr_by_alpha <- expand.grid(
initial_ground_truth = c(0.01, 0.05, 0.1, 0.2), # Different prior probabilities
initial_alpha = alphas # Different significance levels
) %>%
as_tibble() %>%
rowwise() %>%
# Calculate FDR for each combination
mutate(
result = list(calculate_fdr(initial_ground_truth, initial_alpha, beta = 0.2))
) %>%
unnest(result) %>%
# Use the values from the calculate_fdr result, dropping the duplicates
select(-initial_ground_truth, -initial_alpha)
# Plot FDR vs alpha for different ground truth probabilities
ggplot(fdr_by_alpha, aes(x = alpha, y = fdr, color = factor(ground_truth))) +
geom_line(size = 1) +
labs(
title = "False Discovery Rate vs. Significance Level",
subtitle = "For Different Ground Truth Probabilities",
x = "Significance Level (Alpha)",
y = "False Discovery Rate",
color = "Ground Truth"
) +
theme_minimal() +
scale_y_continuous(labels = scales::percent) +
scale_color_discrete(name = "Ground Truth",
labels = scales::percent(unique(fdr_by_alpha$ground_truth)))The plots above reveal a profound challenge in quantitative finance - the false discovery rate is alarmingly high even with traditional statistical safeguards.
High False Discovery Rate with Low Prior Probability: When only 1% of tested strategies are genuinely profitable (a realistic scenario in finance), the false discovery rate is approximately 86% even when using the standard significance level of 5%. This means that roughly 86% of “discoveries” are actually false!
The Prior Probability Problem: Notice how dramatically the FDR drops as the ground truth probability increases. In fields like medicine or physics, where the prior probability of a true effect might be 20-50%, traditional statistical methods work well. But in finance, where market efficiency makes true strategies rare, these methods break down.
The Reliability Threshold: The odds ratio of true to false strategies needs to exceed 6.25% — a ground truth probability of roughly 5.9% — before the false discovery rate drops below 50% (with α=0.05 and β=0.2). This means that without a strong prior belief in a strategy’s effectiveness, most statistically significant backtests are likely to be false.
Significance Level Impact: Making our statistical tests more stringent (lower α) does reduce the false discovery rate, but the effect is modest compared to the impact of the ground truth probability.
Diminishing Returns: Even with a very strict significance level of α=0.01, the false discovery rate remains above 50% when the ground truth probability is just 1%.
The Multiple Testing Connection: This analysis reveals why multiple testing is so problematic in finance - each test increases the opportunity for false positives in an environment where true positives are rare.
These findings help explain why so many published financial strategies fail to replicate and why institutional investors are rightfully skeptical of backtested performance. They also underscore the importance of methods like the Deflated Sharpe Ratio, which explicitly account for multiple testing and the low prior probability of true strategies.
Take a moment to reflect on these questions:
The false discovery rate is high in finance because the prior probability of true strategies (ground truth) is typically very low due to market efficiency. Even with strict statistical tests, if the base rate of true strategies is low, most “discoveries” will be false positives.
You would need to know: (a) how many strategies were tested before finding this one (to assess selection bias), (b) the prior probability of true strategies in your domain, and (c) the power of your test (1-β). With this information, you could calculate the true probability that your strategy is a false discovery.
To increase prior probability, you could: (a) develop strategies based on strong economic rationales rather than data mining, (b) focus on areas of the market with known inefficiencies, (c) incorporate non-public information (where legally permitted), and (d) apply domain expertise to filter strategy ideas before backtesting.
Let’s investigate how the sample size affects the Deflated Sharpe Ratio.
# Generate random strategies with different sample sizes
# This helps us understand how the amount of data affects our ability to detect true strategies
# (in trading days: 63 ~ one quarter, 252 ~ one year, 1008 ~ four years)
sample_sizes <- c(63, 126, 252, 504, 1008) # Approx. 3 months to 4 years of daily data
# One simulation round for a given sample size: generate a universe of
# strategies, select the best by annualised Sharpe ratio (the researcher's
# selection step), and compute the DSR of that selected strategy.
# Returns a one-row tibble; metrics are NA if the DSR calculation errors.
calculate_dsr_by_sample <- function(sample_size, n_strategies = 100, n_trials = 100,
                                    edge_pct = 0.05, edge_size = 0.0005) {
  # Simulate the strategy universe for this sample size
  sim <- generate_random_strategies(
    n_strategies = n_strategies,
    n_returns = sample_size,
    edge_pct = edge_pct,
    edge_size = edge_size
  )
  # Annualised Sharpe ratio for every strategy in the universe
  annual_sharpes <- apply(sim$returns, 2, calculate_sharpe_annualized,
                          annualization_factor = 252)
  # Selection step: keep only the best performer
  best_idx <- which.max(annual_sharpes)
  best_is_true <- sim$has_edge[best_idx]
  # DSR of the selected strategy; a failure degrades to NULL with a warning
  dsr_out <- tryCatch(
    calculate_dsr(
      sim$returns[, best_idx],
      n_trials = n_trials,
      sr_std = sd(annual_sharpes)
    ),
    error = function(e) {
      warning("Error in DSR calculation: ", e$message)
      NULL
    }
  )
  if (is.null(dsr_out)) {
    tibble(
      sample_size = sample_size,
      sharpe_ratio = NA_real_,
      expected_max_sr = NA_real_,
      dsr = NA_real_,
      has_edge = best_is_true
    )
  } else {
    tibble(
      sample_size = sample_size,
      sharpe_ratio = dsr_out$sharpe_ratio, # already annualised
      expected_max_sr = dsr_out$expected_max_sr,
      dsr = dsr_out$dsr,
      has_edge = best_is_true
    )
  }
}
# Run multiple simulations consistently (e.g., 100 strategies, 100 trials)
# (5 sample sizes x 50 simulations = 250 rows in dsr_by_sample)
set.seed(456)
n_simulations <- 50
dsr_by_sample <- map_df(sample_sizes, function(sample_size) {
map_df(1:n_simulations, function(i) {
calculate_dsr_by_sample(sample_size, n_strategies = 100, n_trials = 100)
})
})
library(dplyr)
library(ggridges) # NOTE(review): attached but unused below -- geom_violin is ggplot2
# Violin + jitter plot: distribution of the selected strategy's DSR,
# split by sample size and by whether the pick truly had an edge
ggplot(dsr_by_sample, aes(x = factor(sample_size), y = dsr, fill = has_edge)) +
  geom_violin(trim = FALSE, alpha = 0.6, position = position_dodge(width = 0.8)) +
  geom_jitter(aes(colour = has_edge), width = 0.15, alpha = 0.8, size = 1,
              show.legend = FALSE) +
  scale_fill_manual(values = c("FALSE" = "lightblue", "TRUE" = "darkred")) +
  scale_colour_manual(values = c("FALSE" = "steelblue", "TRUE" = "darkred")) +
  theme_minimal() +
  labs(
    title = "Violin Plot of DSR",
    subtitle = "True vs False Strategies by Sample Size",
    x = "Sample Size (Days)",
    y = "Deflated Sharpe Ratio (DSR)",
    fill = "Strategy Has True Edge"
  )
# # Plot results clearly
# ggplot(dsr_by_sample, aes(x = factor(sample_size), y = dsr, fill = has_edge)) +
# geom_boxplot() +
# labs(
# title = "Distribution of Deflated Sharpe Ratio by Sample Size",
# subtitle = "100 Strategies, 100 Trials, 50 Simulations per Sample Size",
# x = "Sample Size (Number of Returns)",
# y = "Deflated Sharpe Ratio",
# fill = "Strategy Has\nTrue Edge"
# ) +
# theme_minimal() +
# scale_fill_manual(values = c("FALSE" = "lightblue", "TRUE" = "darkred"))

The violin plot visualization above provides critical insights into how sample size significantly influences our ability to accurately distinguish strategies with genuine predictive power (“true edge”) from false discoveries:
At smaller sample sizes (e.g., 63 days, approximately three months), the Deflated Sharpe Ratio (DSR) values exhibit substantial variability, illustrated by the wide distribution in the violin plots. This variability implies that short backtests can easily produce misleading conclusions about strategy quality, leading investors to mistakenly trust strategies that appear statistically sound but are in reality just fortunate anomalies.
As the sample size increases (moving from left to right across the violin plots), the variability of the DSR diminishes noticeably. By the time we reach one or more years of daily data (252 to 1008 days), the distributions become narrower and more clearly differentiated. This improvement reflects greater statistical precision, enabling more reliable assessments of strategy validity.
Strategies with true edges (highlighted in dark red) tend to maintain consistently higher DSR values as sample size grows. In contrast, false strategies (light blue) become increasingly concentrated around lower DSR values with larger sample sizes. This pattern demonstrates that the reliability of the DSR as a tool for discriminating genuinely effective strategies from those resulting from random chance improves significantly with more data.
This analysis clearly supports industry best practices requiring extensive backtest periods:
While longer backtests enhance statistical reliability, they inherently incorporate more diverse market conditions, potentially introducing regime shifts and structural changes. Thus, a long sample period can sometimes mask recent market dynamics or structural breaks. Investors must therefore balance rigorous statistical testing against the need to reflect current market conditions realistically.
In summary, the violin plot clearly illustrates that larger sample sizes significantly reduce the risk of false discoveries. Yet, the choice of sample length should thoughtfully incorporate statistical considerations alongside economic and practical market insights.
Let’s examine the relationship between statistical significance (Sharpe ratio), DSR, and sample size:
# Summarise mean DSR and mean Sharpe per sample size, separately for
# strategies with and without a genuine edge
avg_dsr_by_sample <- dsr_by_sample %>%
  group_by(sample_size, has_edge) %>%
  summarise(
    avg_dsr = mean(dsr, na.rm = TRUE),
    avg_sharpe = mean(sharpe_ratio, na.rm = TRUE),
    .groups = "drop"
  )
# Shared line-and-point chart of an average metric against sample size,
# coloured by whether the selected strategy has a true edge
plot_avg_by_sample <- function(data, metric, chart_title, y_lab) {
  ggplot(data, aes(x = factor(sample_size), y = .data[[metric]],
                   color = has_edge, group = has_edge)) +
    geom_line(size = 1) +
    geom_point(size = 3) +
    labs(
      title = chart_title,
      x = "Sample Size (Number of Returns)",
      y = y_lab,
      color = "Strategy Has\nTrue Edge"
    ) +
    theme_minimal() +
    scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "red"))
}
# Left panel: average DSR by sample size
p1 <- plot_avg_by_sample(avg_dsr_by_sample, "avg_dsr",
                         "Average DSR by Sample Size", "Average DSR")
# Right panel: average Sharpe ratio by sample size
p2 <- plot_avg_by_sample(avg_dsr_by_sample, "avg_sharpe",
                         "Average Sharpe Ratio by Sample Size",
                         "Average Sharpe Ratio")
# Display plots side by side
gridExtra::grid.arrange(p1, p2, ncol = 2)

The visualisations above provide complementary evidence of how strategy evaluation metrics evolve with increasing sample size:
The left plot clearly demonstrates that the average DSR for strategies with a genuine edge (in red) improves significantly with increased sample size. As we gather more data, genuine strategies become more clearly distinguishable from those without an edge (in blue). Importantly, false strategies do not significantly improve as the sample size increases, instead remaining near a neutral DSR value (~0.5), highlighting DSR’s effectiveness at mitigating false discoveries.
Conversely, the right-hand plot shows that the average Sharpe ratios for both true and false strategies converge quickly and closely as the sample size grows. This rapid convergence makes it difficult to differentiate genuine performance from random noise using the Sharpe ratio alone. The substantial drop in Sharpe ratios across both groups as data increases clearly illustrates the Sharpe ratio’s vulnerability to selection bias, particularly in smaller datasets.
This contrast strongly supports the use of DSR in backtest evaluation, particularly when making high-stakes investment decisions based on strategy performance. The analysis further underscores why industry-standard practice recommends multi-year backtests to reliably distinguish truly profitable strategies from those that merely appear promising due to statistical flukes.
Now, let’s simulate the performance of strategies selected based on different criteria to see how DSR predicts out-of-sample performance.
# Simulate strategy selection in-sample and measure realised performance
# out-of-sample.
#
# The in-sample returns play the role of the backtest used to select
# strategies; the out-of-sample returns are brand-new data, with the same
# strategies keeping their genuine edge (if any). The DSR is computed from
# in-sample data only, so we can test whether it predicts future results.
#
# Returns a tibble with one row per strategy: in- and out-of-sample Sharpe
# ratios, the DSR, and the true-edge flag.
simulate_oos_performance <- function(n_strategies = 100,
                                     in_sample_size = 252,
                                     out_sample_size = 252,
                                     edge_pct = 0.05,
                                     edge_size = 0.0005) {
  # "Backtest" period: data used both to select and to score strategies
  in_data <- generate_random_strategies(
    n_strategies = n_strategies,
    n_returns = in_sample_size,
    edge_pct = edge_pct,
    edge_size = edge_size
  )
  in_returns <- in_data$returns
  edge_flags <- in_data$has_edge

  # In-sample Sharpe ratios: what a researcher would see during development
  sr_in <- apply(in_returns, 2, calculate_sharpe)

  # "Live" period: fresh noise, but the same strategies keep their edge
  out_returns <- matrix(
    rnorm(n_strategies * out_sample_size, mean = 0, sd = 0.01),
    nrow = out_sample_size,
    ncol = n_strategies
  )
  out_returns[, edge_flags] <- out_returns[, edge_flags] + edge_size

  # Realised performance after "implementation"
  sr_out <- apply(out_returns, 2, calculate_sharpe)

  # DSR per strategy from in-sample data only; the trial count equals the
  # number of strategies and sr_std is the observed Sharpe dispersion
  # (hoisted out of the loop -- it is the same for every strategy)
  sr_disp <- sd(sr_in)
  dsr_vals <- vapply(
    seq_len(n_strategies),
    function(j) {
      calculate_dsr(in_returns[, j], n_trials = n_strategies, sr_std = sr_disp)$dsr
    },
    numeric(1)
  )

  # One row per strategy, ready for downstream analysis
  tibble(
    strategy = seq_len(n_strategies),
    in_sample_sharpe = sr_in,
    out_sample_sharpe = sr_out,
    dsr = dsr_vals,
    has_edge = edge_flags
  )
}
# Run simulation with 100 strategies
# This gives us enough data to analyze the relationship between DSR and future performance
set.seed(789) # For reproducibility
performance_results <- simulate_oos_performance(n_strategies = 100)
# Scatter of realised (out-of-sample) vs backtested (in-sample) Sharpe.
# Colour encodes the in-sample DSR; triangles mark strategies with a
# genuine edge. If DSR is informative, blue points should sit higher.
ggplot(performance_results,
       aes(x = in_sample_sharpe, y = out_sample_sharpe,
           color = dsr, shape = has_edge)) +
  geom_point(size = 3, alpha = 0.7) +
  # Regression line summarising the overall in/out-of-sample relationship
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "dashed") +
  # Reference lines at zero on both axes
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = 0, linetype = "dotted") +
  scale_color_gradient(low = "red", high = "blue") +
  scale_shape_manual(values = c("FALSE" = 16, "TRUE" = 17)) +
  labs(
    title = "Out-of-Sample vs. In-Sample Sharpe Ratio",
    subtitle = "Color indicates Deflated Sharpe Ratio, Shape indicates True Edge",
    x = "In-Sample Sharpe Ratio",
    y = "Out-of-Sample Sharpe Ratio",
    color = "DSR",
    shape = "Has True Edge"
  ) +
  theme_minimal()
# Group strategies by DSR to analyze performance patterns
# Bucket each strategy into a DSR band so we can ask whether a higher DSR
# translates into better realised performance
performance_results <- performance_results %>%
  mutate(
    dsr_group = cut(
      dsr,
      breaks = c(0, 0.2, 0.5, 0.8, 1),
      labels = c("Very Low (0-0.2)", "Low (0.2-0.5)",
                 "Medium (0.5-0.8)", "High (0.8-1)")
    )
  )
# Per-band summary: backtest vs realised Sharpe, a robust median, and how
# often the band contains a genuinely skilled strategy
dsr_group_performance <- performance_results %>%
  group_by(dsr_group) %>%
  summarise(
    count = n(),                                    # strategies in band
    avg_in_sample = mean(in_sample_sharpe),         # mean backtest Sharpe
    avg_out_sample = mean(out_sample_sharpe),       # mean realised Sharpe
    median_out_sample = median(out_sample_sharpe),  # robust to outliers
    positive_rate = mean(out_sample_sharpe > 0),    # share ending positive
    true_edge_rate = mean(has_edge)                 # share with a real edge
  ) %>%
  arrange(dsr_group)
# Display the results in a nicely formatted table
kable(dsr_group_performance,
caption = "Out-of-Sample Performance by DSR Group",
digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

| dsr_group | count | avg_in_sample | avg_out_sample | median_out_sample | positive_rate | true_edge_rate |
|---|---|---|---|---|---|---|
| Very Low (0-0.2) | 53 | -0.047 | -0.012 | -0.018 | 0.434 | 0.019 |
| Low (0.2-0.5) | 5 | 0.009 | 0.015 | 0.025 | 0.600 | 0.000 |
| Medium (0.5-0.8) | 4 | 0.012 | -0.019 | -0.009 | 0.500 | 0.000 |
| High (0.8-1) | 38 | 0.072 | 0.008 | -0.007 | 0.447 | 0.105 |
# Boxplots of realised Sharpe within each DSR band, split by true edge.
# Shows the full distribution rather than just the band averages.
ggplot(performance_results,
       aes(x = dsr_group, y = out_sample_sharpe, fill = has_edge)) +
  geom_boxplot(alpha = 0.7) +
  # Reference line at zero realised Sharpe
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  scale_fill_manual(values = c("FALSE" = "steelblue", "TRUE" = "darkred")) +
  labs(
    title = "Out-of-Sample Sharpe Ratio by DSR Group",
    x = "Deflated Sharpe Ratio Group",
    y = "Out-of-Sample Sharpe Ratio",
    fill = "Has True Edge"
  ) +
  theme_minimal()

The scatterplot above explores a fundamental question in quantitative finance: Does the Deflated Sharpe Ratio (DSR) predict future (out-of-sample) strategy performance?
Colour indicates DSR: Strategies coloured in blue have higher DSR values (closer to 1), suggesting a high confidence in their genuine predictive ability based on in-sample analysis. Conversely, strategies coloured red (low DSR values) indicate strategies more likely to be statistical artefacts or false discoveries.
Shape indicates whether a strategy has a true edge: Triangles show strategies deliberately given an actual performance edge. Circles represent purely random strategies without any real edge.
High-DSR Strategies Perform Better Out-of-Sample: The strategies with the highest DSR (blue points) generally cluster in areas with positive out-of-sample performance. This indicates that the DSR effectively identifies strategies that not only performed well historically (in-sample) but also continue to perform well in the future.
False Strategies Exhibit Poor Out-of-Sample Performance: Many strategies with low DSR (red points) or those identified as false strategies (circles) show substantial deterioration from in-sample to out-of-sample performance, illustrating a classic regression-to-the-mean effect. These strategies, despite occasionally impressive in-sample results, fail when implemented in live conditions.
DSR Provides Incremental Information Beyond Sharpe Ratio Alone: Importantly, the DSR colour gradient reveals that even some strategies with high in-sample Sharpe ratios can have low DSR values, indicating they are less likely to replicate their success out-of-sample. Thus, the DSR serves as an additional layer of quality control beyond the Sharpe ratio.
This analysis clearly demonstrates the value of the DSR in strategy evaluation. Practitioners can leverage the DSR to effectively differentiate strategies that have a robust predictive edge from those likely to fail due to selection bias or statistical randomness. Incorporating DSR into strategy selection processes can significantly enhance investment decisions, reducing costly errors from overfitting or selection bias.
Let’s evaluate a simple moving average crossover strategy to demonstrate the DSR calculation process in a realistic context.
# Set seed for reproducibility
set.seed(123)
# Generate synthetic market data: ~10% annual return, ~16% annual volatility
n_days <- 1000
market_returns <- rnorm(n_days, mean = 0.0004, sd = 0.01)
prices <- 100 * cumprod(1 + market_returns)
# Create strategy: Buy when short MA crosses above long MA, sell when it crosses below
short_window <- 20
long_window <- 50
# Calculate moving averages.
# BUG FIX: zoo::rollapply defaults to align = "center", which averages over
# FUTURE prices -- lookahead bias in a trading signal. align = "right" makes
# each moving average use only data available up to that day.
short_ma <- zoo::rollapply(prices, short_window, mean, fill = NA, align = "right")
long_ma <- zoo::rollapply(prices, long_window, mean, fill = NA, align = "right")
# Generate signals: 1 for long, -1 for short, 0 for no position
# NOTE: because the final else-branch carries yesterday's value forward,
# `signals` is effectively a persistent position state, not just the
# crossover events themselves.
signals <- rep(0, n_days)
for (i in (long_window+1):n_days) {
# Check if we have valid values for comparison
# (any NA in today's or yesterday's MAs forces the signal to 0, which also
# resets the carried-forward state)
if (!is.na(short_ma[i]) && !is.na(short_ma[i-1]) &&
!is.na(long_ma[i]) && !is.na(long_ma[i-1])) {
if (short_ma[i] > long_ma[i] && short_ma[i-1] <= long_ma[i-1]) {
signals[i] <- 1 # Buy signal
} else if (short_ma[i] < long_ma[i] && short_ma[i-1] >= long_ma[i-1]) {
signals[i] <- -1 # Sell signal
} else {
signals[i] <- signals[i-1] # Maintain previous position
}
} else {
signals[i] <- 0 # No signal when we don't have enough data
}
}
# Convert signals to position vector (1 = long, -1 = short, 0 = neutral)
# NOTE(review): `signals` already carries its value forward, so this loop
# largely mirrors it; the carry-forward branch only matters where signals
# is 0 (i.e. where the MAs had insufficient history).
positions <- rep(0, n_days)
for (i in (long_window+1):n_days) {
if (signals[i] == 1) positions[i] <- 1
else if (signals[i] == -1) positions[i] <- -1
else positions[i] <- positions[i-1]
}
# Calculate strategy returns
# Yesterday's position applied to today's market return (the one-day lag
# avoids trading on a signal before it exists); day 1 has no position, so 0.
strategy_returns <- c(0, positions[-n_days] * market_returns[-1])
# Analyze basic performance
# Annualised Sharpe ratio over the period where the long MA is defined
sharpe_standard <- mean(strategy_returns[(long_window+1):n_days]) /
sd(strategy_returns[(long_window+1):n_days]) * sqrt(252)
# Now let's assume this strategy was selected from 50 different parameter combinations
# Calculate DSR
# NOTE(review): sr_std should be the dispersion of SHARPE RATIOS across the
# 50 trials, but here it is set to the annualised std of daily RETURNS
# (~0.16). Earlier in this tutorial sr_std = sd(sharpes) over trials is
# used instead -- confirm which units calculate_dsr expects; the implausible
# "Expected Max Sharpe = 5.69" printed below suggests this input is wrong.
dsr_result <- calculate_dsr(
strategy_returns[(long_window+1):n_days],
n_trials = 50, # 50 different MA combinations were tested
sr_std = sd(strategy_returns[(long_window+1):n_days]) * sqrt(252)
)
# Display results
cat("Traditional Sharpe Ratio (annualized):", round(sharpe_standard, 2), "\n")

Traditional Sharpe Ratio (annualized): 0.12
cat("Expected Max Sharpe Ratio from 50 trials:",
round(dsr_result$expected_max_sr * sqrt(252), 2), "\n")

Expected Max Sharpe Ratio from 50 trials: 5.69

cat("Deflated Sharpe Ratio:", round(dsr_result$dsr, 4), "\n")

Deflated Sharpe Ratio: 0
# Visualize strategy performance
# Assemble a per-day frame of prices, MAs, positions and cumulative returns
strat_cum <- cumprod(1 + strategy_returns) - 1
mkt_cum <- cumprod(1 + market_returns) - 1
performance_data <- data.frame(
  Day = seq_len(n_days),
  Price = prices,
  ShortMA = short_ma,
  LongMA = long_ma,
  Position = positions,
  CumulativeReturn = strat_cum,
  MarketCumulativeReturn = mkt_cum
)
# Top panel: price series with both moving averages overlaid
p1 <- ggplot(performance_data, aes(x = Day)) +
  geom_line(aes(y = Price), color = "black") +
  geom_line(aes(y = ShortMA), color = "blue", linetype = "solid") +
  geom_line(aes(y = LongMA), color = "red", linetype = "solid") +
  theme_minimal() +
  labs(title = "Price and Moving Averages", x = "Trading Day", y = "Price")
# Bottom panel: strategy vs market cumulative returns, with the headline
# Sharpe and DSR numbers carried in the subtitle
p2 <- ggplot(performance_data, aes(x = Day)) +
  geom_line(aes(y = CumulativeReturn, color = "Strategy")) +
  geom_line(aes(y = MarketCumulativeReturn, color = "Market")) +
  scale_color_manual(values = c("Strategy" = "blue", "Market" = "black")) +
  theme_minimal() +
  labs(
    title = "Cumulative Returns",
    subtitle = paste("Sharpe:", round(sharpe_standard, 2),
                     "DSR:", round(dsr_result$dsr, 4)),
    x = "Trading Day",
    y = "Cumulative Return",
    color = "Series"
  )
# Display plots
gridExtra::grid.arrange(p1, p2, ncol = 1)

The plots clearly illustrate the pitfalls of relying solely on traditional performance metrics like the Sharpe ratio when evaluating investment strategies:
Visual Assessment: The top plot shows typical market price fluctuations and moving averages. Visually, the strategy (buying/selling based on crossovers) seems plausible but generates only random entry and exit signals.
Performance Reality Check: In the cumulative returns plot, the strategy’s performance is consistently weaker than the underlying market, despite occasional periods of positive returns. This shows how random strategies can appear superficially promising.
Quantitative Confirmation by DSR: While the strategy’s annualised Sharpe ratio (0.12) might appear mildly attractive in isolation, the DSR of 0.00 unequivocally highlights that this performance is illusory. Given that 50 strategies were tested, the observed small positive performance is precisely what we would expect due to selection bias, not genuine predictive power.
In this tutorial, we have critically explored the issue of selection bias in quantitative investment research, demonstrating how multiple testing inflates performance metrics. Key insights include:
This analysis reinforces the need for rigorous statistical approaches, like DSR, to safeguard against backtest overfitting and selection bias, thus improving investment strategy decisions.
Backtest Overfitting: Excessive tailoring of a strategy to historical data, capturing noise rather than genuine patterns, resulting in poor future performance.
Deflated Sharpe Ratio (DSR): A performance metric correcting the Sharpe ratio for multiple-testing bias, skewness, kurtosis, and sample size.
False Discovery Rate (FDR): Expected proportion of false-positive findings among strategies identified as profitable.
False Strategy Theorem: Demonstrates the expected inflation in Sharpe ratios purely due to multiple testing.
Multiple Testing Problem: Statistical challenge arising when evaluating numerous strategies simultaneously, increasing the risk of false positives.
Precision: Probability a strategy identified as profitable truly has predictive power.
Recall: Probability a genuinely profitable strategy is identified as profitable in tests.
Selection Bias under Multiple Testing (SBuMT): Performance inflation occurring when only the best-performing strategies are reported among many tested.
Sharpe Ratio: Ratio of excess returns to volatility, commonly used as a measure of risk-adjusted return.
Type I Error (False Positive): Incorrectly concluding a strategy has predictive power.
Type II Error (False Negative): Failing to identify a genuinely profitable strategy.
Sharpe Ratio Distribution: Generate 500 random strategies with 126 days of returns. Plot their Sharpe ratios. Compare these results to 252-day samples. How does reducing sample size affect performance metrics?
Expected Maximum Sharpe: Use expected_max_sharpe to plot how expected Sharpe ratios increase with trials: test 10, 50, 100, and 500 trials.
Modify `generate_random_strategies` to introduce genuine edges. Compare the effectiveness of both methods.
Report your findings clearly using visuals.
Evaluate out-of-sample performance for each approach.
Assess performance across different market regimes and compare DSR assessment to traditional metrics.
The following Online Resources section provides a clear, structured list of verified links for further study: